{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Atividades\n",
    "\n",
    "1. Utilizamos a medida de Entropia como fator de decisão (medida de impureza de um nó). Teste o mesmo conjunto \n",
    "randômico de dados para a medida Gini e compare os resultados.\n",
    "Ref1.: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier\n",
    "Ref2.: https://en.wikipedia.org/wiki/Decision_tree_learning\n",
    "\n",
    "2. Faça o balanceamento dos dados contidos em \"train.csv\", aplique o algoritmo de Decision Tree e faça a submissão no kaggle. Tente melhorar o resultado obtido em sala de aula (posição 3100 no leaderboard).\n",
    "Dataset: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction\n",
    "\n",
    "3. (Opcional) Execute uma Random Forest na competição do Kaggle e veja se a acurácia melhora. Utilize 10, 100 ou 1000 árvores (dependendo de quanto o seu computador aguentar =]): http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "#In order to install imblearn package\n",
    "#!conda install -c glemaitre imbalanced-learn\n",
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_data = pd.read_csv(\"train.csv\")\n",
    "test_data = pd.read_csv(\"test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    401631\n",
       "1     15017\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = train_data.iloc[:,2:]\n",
    "y = train_data.iloc[:,1]\n",
    "x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)\n",
    "class_count = y_train.value_counts()\n",
    "class_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original dataset shape Counter({0: 401631, 1: 15017})\n",
      "Resampled dataset shape Counter({0: 401631, 1: 401631})\n"
     ]
    }
   ],
   "source": [
    "#Balanceando os dados, removendo rows da classe 0 \n",
    "#train = x_train.copy()\n",
    "#train = train.join(y_train.copy())\n",
    "#train_df = pd.DataFrame(data=train)\n",
    "#train_class0 = train[train.target == 0]\n",
    "#train_class1 = train[train.target == 1]\n",
    "#np.random.seed(10)\n",
    "#remove_n = class_count[0] - (3*class_count[1])\n",
    "#drop_indices = np.random.choice(train_class0.index, remove_n, replace=False)\n",
    "#train_class0 = train_class0.drop(drop_indices)\n",
    "\n",
    "# Since removing data did not work, I am gonna try to add fake data in order to balance the dataset\n",
    "print('Original dataset shape {}'.format(Counter(y_train)))\n",
    "sm = SMOTE(random_state=42)\n",
    "X_res, y_res = sm.fit_sample(x_train, y_train)\n",
    "print('Resampled dataset shape {}'.format(Counter(y_res)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.917519768822 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Trainning dataset with decision tree model\n",
    "d_tree = DecisionTreeClassifier(criterion='gini')\n",
    "d_tree.fit(X_res, y_res)\n",
    "y_hat = d_tree.predict(x_test)\n",
    "\n",
    "print(\"Score:\",d_tree.score(x_test, y_test),\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False Positives:  8501 \n",
      "False Negatives:  6227\n"
     ]
    }
   ],
   "source": [
    "#Just to check whether the model is actually trainned\n",
    "CM = confusion_matrix(y_test, y_hat)\n",
    "\n",
    "TN = CM[0][0]\n",
    "FN = CM[1][0]\n",
    "TP = CM[1][1]\n",
    "FP = CM[0][1]\n",
    "\n",
    "print(\"False Positives: \", FP, \"\\nFalse Negatives: \", FN)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.961033578997 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rfc = RandomForestClassifier(n_estimators=5, random_state=0, criterion='entropy')\n",
    "rfc.fit(X_res, y_res)\n",
    "y_hatRfc = rfc.predict(x_test)\n",
    "\n",
    "print(\"Score:\",rfc.score(x_test, y_test),\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False Positives:  311 \n",
      "False Negatives:  6647\n"
     ]
    }
   ],
   "source": [
    "CMRfc = confusion_matrix(y_test, y_hatRfc)\n",
    "\n",
    "TN = CMRfc[0][0]\n",
    "FN = CMRfc[1][0]\n",
    "TP = CMRfc[1][1]\n",
    "FP = CMRfc[0][1]\n",
    "\n",
    "print(\"False Positives: \", FP, \"\\nFalse Negatives: \", FN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    171887\n",
       "1      6677\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#I could not make these models learn to properly classify the class 1 :("
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
